Module 13. Elements of Regression Analysis

Emiley Garcia-Zych

#install necessary packages 
##install.packages("curl")
##install.packages("car")
library(curl)
## Using libcurl 8.1.2 with LibreSSL/3.3.6
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN588_Fall23/zombies.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = FALSE)
m <- lm(data = d, height ~ weight)
SSY <- sum((m$model$height - mean(m$model$height))^2)  # height - mean(height)
SSY
## [1] 18558.61
SSR <- sum((m$fitted.values - mean(m$model$height))^2)  # predicted height - mean height
SSR
## [1] 12864.82
SSE <- sum((m$model$height - m$fitted.values)^2)  # height - predicted height
SSE
## [1] 5693.785
df_regression <- 1
df_error <- 998
df_y <- 999
MSR <- SSR/df_regression
MSE <- SSE/df_error
MSY <- SSY/df_y
fratio <- MSR/MSE
fratio
## [1] 2254.931
curve(df(x, df = 1, df2 = 1), col = "green", lty = 3, lwd = 2, xlim = c(0, 10),
    main = "Some Example F Distributions\n(vertical line shows critical value for df1=1,df2=998)",
    ylab = "f(x)", xlab = "x")
curve(df(x, df = 2, df2 = 2), col = "blue", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 4, df2 = 4), col = "red", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 8, df2 = 6), col = "purple", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 1, df2 = 998), col = "black", lwd = 3, add = TRUE)
legend("top", c("df1=1,df2=1", "df1=2,df2=2", "df1=4,df2=4", "df1=8,df2=6",
    "df1=1,df2=998"), lty = 3, lwd = 2, col = c("green", "blue", "red", "purple",
    "black"), bty = "n", cex = 0.75)

fcrit <- qf(p = 0.95, df1 = 1, df2 = 998)
fcrit
## [1] 3.850793
a <- aov(data = d, height ~ weight)
summary(a)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## weight        1  12865   12865    2255 <2e-16 ***
## Residuals   998   5694       6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(m)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## weight        1  12865   12865    2255 <2e-16 ***
## Residuals   998   5694       6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rsquared <- SSR/SSY
rsquared
## [1] 0.6931998
rho <- sqrt(rsquared)
rho
## [1] 0.8325862
SSX <- sum((m$model$weight - mean(m$model$weight))^2)
SEbeta1 <- sqrt(MSE/SSX)
SEbeta1
## [1] 0.004106858
SEbeta0 <- sqrt((MSE * sum(m$model$weight^2))/(1000 * SSX))
SEbeta0
## [1] 0.5958147
SEyhat <- sqrt(MSE * (1/1000 + (m$model$weight - mean(m$model$weight))^2/SSX))
head(SEyhat)  # just the first 6 rows
## [1] 0.08978724 0.07620966 0.08414480 0.09533986 0.08904151 0.08341218
summary(m)
## 
## Call:
## lm(formula = height ~ weight, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1519 -1.5206 -0.0535  1.5167  9.4439 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.565446   0.595815   66.41   <2e-16 ***
## weight       0.195019   0.004107   47.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.389 on 998 degrees of freedom
## Multiple R-squared:  0.6932, Adjusted R-squared:  0.6929 
## F-statistic:  2255 on 1 and 998 DF,  p-value: < 2.2e-16

Challenge 1

Calculate the residuals from the regression of zombie height on weight and plot these in relation to weight (the x variable). There are lots of ways to do this quickly.

m <- lm(data = d, height ~ weight)
plot(x = d$weight, y = m$residuals)

# or
e <- resid(m)
plot(x = d$weight, y = e)

hist(e, xlim = c(-4 * sd(e), 4 * sd(e)), breaks = 20, main = "Histogram of Residuals")

plot(m$model$weight, m$residuals)

par(mfrow = c(2, 2))
qqnorm(m$residuals)

library(car)
## Loading required package: carData
qqPlot(m$residuals)

## [1] 954 799
s <- shapiro.test(m$residuals)
s
## 
##  Shapiro-Wilk normality test
## 
## data:  m$residuals
## W = 0.99713, p-value = 0.07041

Challenge 2

Load in the "KamilarAndCooper.csv" dataset and develop a linear model to look at the relationship between "weaning age" and "female body mass". You will probably need to look at the data and variable names again to find the appropriate variables to examine.

  • Using the procedures outlined above and in Module 12, calculate estimates of β0 and β1 by hand *and using the lm() function. Are the regression coefficients estimated under a simple linear model statistically significantly different from zero?

  • Construct an ANOVA table by hand and compare your values to the results of running lm()and then looking at summary.aov(lm()).

  • Generate the residuals for your linear model by hand, plot them in relation to female body weight, and make a histogram of the residuals. Do they appear to be normally distributed?

  • Run the plot() command on the result of lm() and examine the 4 plots produced. Again, based on examination of the residuals and the results of Shapiro-Wilks test, does it look like your model has good fit?

f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN588_Fall23/KamilarAndCooperData.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = FALSE)
head(d)
##               Scientific_Name          Family          Genus      Species
## 1 Allenopithecus_nigroviridis Cercopithecidae Allenopithecus nigroviridis
## 2         Allocebus_trichotis Cercopithecidae      Allocebus    trichotis
## 3           Alouatta_belzebul        Atelidae       Alouatta     belzebul
## 4             Alouatta_caraya        Atelidae       Alouatta       caraya
## 5            Alouatta_guariba        Atelidae       Alouatta      guariba
## 6           Alouatta_palliata        Atelidae       Alouatta     palliata
##   Brain_Size_Species_Mean Brain_Size_Female_Mean   Brain_size_Ref
## 1                   58.02                  53.70 Isler et al 2008
## 2                      NA                     NA                 
## 3                   52.84                  51.19 Isler et al 2008
## 4                   52.63                  47.80 Isler et al 2008
## 5                   51.70                  49.08 Isler et al 2008
## 6                   49.88                  48.04 Isler et al 2008
##   Body_mass_male_mean Body_mass_female_mean Mass_Dimorphism
## 1                6130                  3180           1.928
## 2                  92                    84           1.095
## 3                7270                  5520           1.317
## 4                6525                  4240           1.539
## 5                5800                  4550           1.275
## 6                7150                  5350           1.336
##                 Mass_Ref MeanGroupSize AdultMales AdultFemale AdultSexRatio
## 1       Isler et al 2008            NA         NA          NA            NA
## 2 Smith and Jungers 1997          1.00       1.00         1.0            NA
## 3       Isler et al 2008          7.00       1.00         1.0          1.00
## 4       Isler et al 2008          8.00       2.30         3.3          1.43
## 5       Isler et al 2008          6.53       1.37         2.2          1.61
## 6       Isler et al 2008         12.00       2.90         6.3          2.17
##                                                     Social_Organization_Ref
## 1                                                                          
## 2                                                             Kappeler 1997
## 3                                                       Campbell et al 2007
## 4 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
## 5                                                       Campbell et al 2007
## 6 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
##   InterbirthInterval_d Gestation WeaningAge_d MaxLongevity_m LitterSz
## 1                   NA        NA       106.15          276.0     1.01
## 2                   NA        NA           NA             NA     1.00
## 3                   NA        NA           NA             NA       NA
## 4               337.62       187       323.16          243.6     1.01
## 5                   NA        NA           NA             NA       NA
## 6               684.37       186       495.60          300.0     1.02
##    Life_History_Ref GR_MidRangeLat_dd Precip_Mean_mm Temp_Mean_degC AET_Mean_mm
## 1 Jones et al. 2009             -0.17         1574.0           25.2      1517.8
## 2                              -16.59         1902.3           20.3      1388.2
## 3                               -6.80         1643.5           24.9      1286.6
## 4 Jones et al. 2009            -20.34         1166.4           22.9      1193.1
## 5                              -21.13         1332.3           19.6      1225.7
## 6 Jones et al. 2009              6.95         1852.6           23.7      1300.0
##   PET_Mean_mm       Climate_Ref HomeRange_km2      HomeRangeRef DayLength_km
## 1      1589.4 Jones et al. 2009            NA                             NA
## 2      1653.7 Jones et al. 2009            NA                             NA
## 3      1549.8 Jones et al. 2009            NA                             NA
## 4      1404.9 Jones et al. 2009            NA                           0.40
## 5      1332.2 Jones et al. 2009          0.03 Jones et al. 2009           NA
## 6      1633.9 Jones et al. 2009          0.19 Jones et al. 2009         0.32
##       DayLengthRef Territoriality Fruit Leaves Fauna             DietRef1
## 1                              NA    NA                                  
## 2                              NA    NA                                  
## 3                              NA  57.3   19.1   0.0 Campbell et al. 2007
## 4 Nunn et al. 2003             NA  23.8   67.7   0.0 Campbell et al. 2007
## 5                              NA   5.2   73.0   0.0 Campbell et al. 2007
## 6 Nunn et al. 2003         0.6506  33.1   56.4   0.0 Campbell et al. 2007
##   Canine_Dimorphism Canine_Dimorphism_Ref  Feed  Move  Rest Social
## 1             2.210   Plavcan & Ruff 2008    NA    NA    NA     NA
## 2                NA                          NA    NA    NA     NA
## 3             1.811   Plavcan & Ruff 2008 13.75 18.75 57.30  10.00
## 4             1.542   Plavcan & Ruff 2008 15.90 17.60 61.60   4.90
## 5             1.783   Plavcan & Ruff 2008 18.33 14.33 64.37   3.00
## 6             1.703   Plavcan & Ruff 2008 17.94 12.32 66.14   3.64
##    Activity_Budget_Ref
## 1                     
## 2                     
## 3 Campbell et al. 2007
## 4 Campbell et al. 2007
## 5 Campbell et al. 2007
## 6 Campbell et al. 2007
plot(data = d, WeaningAge_d ~ Body_mass_female_mean)

model <- lm(data = d, WeaningAge_d ~ Body_mass_female_mean)
summary(model)
## 
## Call:
## lm(formula = WeaningAge_d ~ Body_mass_female_mean, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -720.39 -115.48  -54.05   58.11  471.22 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.016e+02  2.012e+01   10.02   <2e-16 ***
## Body_mass_female_mean 2.013e-02  1.927e-03   10.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 183.4 on 114 degrees of freedom
##   (97 observations deleted due to missingness)
## Multiple R-squared:  0.489,  Adjusted R-squared:  0.4845 
## F-statistic: 109.1 on 1 and 114 DF,  p-value: < 2.2e-16
plot(model)

qqPlot(model$residuals)

## 97 44 
## 48 24
s <- shapiro.test(model$residuals)
s
## 
##  Shapiro-Wilk normality test
## 
## data:  model$residuals
## W = 0.86291, p-value = 5.825e-09

Challenge 3

Return to the "KamilarAndCooper.csv" dataset you were looking at above and log transform both of your variables and then run a simple bivariate linear model. Do you notice a difference between these results and those obtained using untransformed variables?

d$logWeaningAge <- log(d$WeaningAge_d)
d$logFemaleBodyMass <- log(d$Body_mass_female_mean)
plot(data = d, logWeaningAge ~ logFemaleBodyMass)

model <- lm(data = d, logWeaningAge ~ logFemaleBodyMass)
summary(model)
## 
## Call:
## lm(formula = logWeaningAge ~ logFemaleBodyMass, data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10639 -0.32736  0.00848  0.32214  1.11010 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.7590     0.2196   8.011 1.08e-12 ***
## logFemaleBodyMass   0.4721     0.0278  16.983  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4532 on 114 degrees of freedom
##   (97 observations deleted due to missingness)
## Multiple R-squared:  0.7167, Adjusted R-squared:  0.7142 
## F-statistic: 288.4 on 1 and 114 DF,  p-value: < 2.2e-16
plot(model)

s <- shapiro.test(model$residuals)
s
## 
##  Shapiro-Wilk normality test
## 
## data:  model$residuals
## W = 0.99367, p-value = 0.8793
par(mfrow = c(1, 2))

a <- 2
b <- 2

# log x
x <- seq(from = 0, to = 100, length.out = 1000)
y <- a + b * log(x)
plot(x, y, type = "l", main = "untransformed")
plot(log(x), y, type = "l", main = "log(x)")

# log y
x <- seq(from = 0, to = 10, length.out = 1000)
y <- exp(a + b * x)
plot(x, y, type = "l", main = "untransformed")

plot(x, log(y), type = "l", main = "log(y)")

# assymptotic
x <- seq(from = 1, to = 100, length.out = 100)
y <- (a * x)/(1 + b * x)
plot(x, y, type = "l", main = "untransformed")

plot(1/x, y, type = "l", main = "1/x")

# reciprocal
x <- seq(from = 1, to = 100, length.out = 100)
y <- a + b/x
plot(x, y, type = "l", main = "untransformed")

plot(1/x, y, type = "l", main = "1/x")

# exp
x <- seq(from = 1, to = 10, length.out = 100)
y <- a * exp(b * x)
plot(x, y, type = "l", main = "untransformed")

plot(x, log(y), type = "l", main = "log(y)")